Data Inspection¶
# Load the 10-year flight dataset in 100k-row chunks so tqdm can report
# progress, then assemble the chunks into a single DataFrame.
import plotly
plotly.offline.init_notebook_mode()
import pandas as pd
from tqdm import tqdm

chunk_iter = pd.read_csv(
    "../data/Total_Data_10Y_Top24.csv",
    chunksize=100000,
    # Keep cancellation codes as strings; the column is mostly NaN.
    dtype={"CANCELLATION_CODE": str},
)
df = pd.concat([chunk for chunk in tqdm(chunk_iter)])
df.head()
130it [00:28, 4.52it/s]
| FL_DATE | OP_UNIQUE_CARRIER | TAIL_NUM | ORIGIN_AIRPORT_SEQ_ID | ORIGIN_CITY_MARKET_ID | ORIGIN | ORIGIN_CITY_NAME | DEST_AIRPORT_SEQ_ID | DEST_CITY_MARKET_ID | DEST | ... | TAXI_IN | ARR_TIME | ARR_DELAY | CANCELLED | CANCELLATION_CODE | CARRIER_DELAY | WEATHER_DELAY | NAS_DELAY | SECURITY_DELAY | LATE_AIRCRAFT_DELAY | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2014-07-01 | AA | N002AA | 1105703 | 31057 | CLT | Charlotte, NC | 1129803 | 30194 | DFW | ... | 28.0 | 1214.0 | 9.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN |
| 1 | 2014-07-01 | AA | N002AA | 1129803 | 30194 | DFW | Dallas/Fort Worth, TX | 1105703 | 31057 | CLT | ... | 13.0 | 945.0 | 0.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN |
| 2 | 2014-07-01 | AA | N004AA | 1039705 | 30397 | ATL | Atlanta, GA | 1129803 | 30194 | DFW | ... | 6.0 | 1341.0 | -9.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN |
| 3 | 2014-07-01 | AA | N004AA | 1129803 | 30194 | DFW | Dallas/Fort Worth, TX | 1039705 | 30397 | ATL | ... | 7.0 | 1159.0 | 4.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN |
| 4 | 2014-07-01 | AA | N004AA | 1129803 | 30194 | DFW | Dallas/Fort Worth, TX | 1039705 | 30397 | ATL | ... | 6.0 | 2317.0 | 2.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN |
5 rows × 24 columns
# Summary statistics for all numeric columns (the count row reveals missing data).
df.describe()
| ORIGIN_AIRPORT_SEQ_ID | ORIGIN_CITY_MARKET_ID | DEST_AIRPORT_SEQ_ID | DEST_CITY_MARKET_ID | DEP_TIME | DEP_DELAY | TAXI_OUT | TAXI_IN | ARR_TIME | ARR_DELAY | CANCELLED | CARRIER_DELAY | WEATHER_DELAY | NAS_DELAY | SECURITY_DELAY | LATE_AIRCRAFT_DELAY | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 1.292656e+07 | 1.292656e+07 | 1.292656e+07 | 1.292656e+07 | 1.271494e+07 | 1.271486e+07 | 1.271048e+07 | 1.270744e+07 | 1.270744e+07 | 1.268139e+07 | 1.292656e+07 | 2.559603e+06 | 2.559603e+06 | 2.559603e+06 | 2.559603e+06 | 2.559603e+06 |
| mean | 1.298824e+06 | 3.161338e+04 | 1.298812e+06 | 3.161370e+04 | 1.332409e+03 | 1.127152e+01 | 1.722694e+01 | 8.678440e+00 | 1.471801e+03 | 5.409690e+00 | 1.675551e-02 | 2.041558e+01 | 2.650336e+00 | 1.520350e+01 | 1.268451e-01 | 2.413592e+01 |
| std | 1.453419e+05 | 1.168922e+03 | 1.453231e+05 | 1.168763e+03 | 5.192380e+02 | 4.514114e+01 | 9.141093e+00 | 6.704624e+00 | 5.554134e+02 | 4.738376e+01 | 1.283540e-01 | 5.935956e+01 | 2.208897e+01 | 3.191662e+01 | 3.210375e+00 | 5.163883e+01 |
| min | 1.039705e+06 | 3.019400e+04 | 1.039705e+06 | 3.019400e+04 | 1.000000e+00 | -2.340000e+02 | 0.000000e+00 | 1.000000e+00 | 1.000000e+00 | -2.380000e+02 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 |
| 25% | 1.129806e+06 | 3.046600e+04 | 1.129806e+06 | 3.046600e+04 | 9.060000e+02 | -5.000000e+00 | 1.200000e+01 | 5.000000e+00 | 1.052000e+03 | -1.400000e+01 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 |
| 50% | 1.289208e+06 | 3.145400e+04 | 1.289208e+06 | 3.145400e+04 | 1.324000e+03 | -1.000000e+00 | 1.500000e+01 | 7.000000e+00 | 1.515000e+03 | -5.000000e+00 | 0.000000e+00 | 3.000000e+00 | 0.000000e+00 | 2.000000e+00 | 0.000000e+00 | 0.000000e+00 |
| 75% | 1.410702e+06 | 3.245700e+04 | 1.410702e+06 | 3.245700e+04 | 1.751000e+03 | 9.000000e+00 | 2.000000e+01 | 1.000000e+01 | 1.930000e+03 | 9.000000e+00 | 0.000000e+00 | 2.000000e+01 | 0.000000e+00 | 1.900000e+01 | 0.000000e+00 | 2.800000e+01 |
| max | 1.501606e+06 | 3.481900e+04 | 1.501606e+06 | 3.481900e+04 | 2.400000e+03 | 3.695000e+03 | 2.270000e+02 | 1.419000e+03 | 2.400000e+03 | 3.680000e+03 | 1.000000e+00 | 3.359000e+03 | 2.692000e+03 | 1.511000e+03 | 9.870000e+02 | 3.581000e+03 |
# Latest flight date in the dataset (output below confirms the window ends 2024-06-30).
df.FL_DATE.max()
'2024-06-30'
# Cardinality of each column (e.g. 24 airports, 20 carriers, 4 cancellation codes).
df.nunique()
FL_DATE 3653 OP_UNIQUE_CARRIER 20 TAIL_NUM 9140 ORIGIN_AIRPORT_SEQ_ID 58 ORIGIN_CITY_MARKET_ID 20 ORIGIN 24 ORIGIN_CITY_NAME 24 DEST_AIRPORT_SEQ_ID 58 DEST_CITY_MARKET_ID 20 DEST 24 DEST_CITY_NAME 24 DEP_TIME 1440 DEP_DELAY 1783 TAXI_OUT 195 TAXI_IN 247 ARR_TIME 1440 ARR_DELAY 1806 CANCELLED 2 CANCELLATION_CODE 4 CARRIER_DELAY 1558 WEATHER_DELAY 971 NAS_DELAY 855 SECURITY_DELAY 268 LATE_AIRCRAFT_DELAY 1228 dtype: int64
# Column dtypes; note FL_DATE is still a string ("object") here and is parsed later.
df.dtypes
FL_DATE object OP_UNIQUE_CARRIER object TAIL_NUM object ORIGIN_AIRPORT_SEQ_ID int64 ORIGIN_CITY_MARKET_ID int64 ORIGIN object ORIGIN_CITY_NAME object DEST_AIRPORT_SEQ_ID int64 DEST_CITY_MARKET_ID int64 DEST object DEST_CITY_NAME object DEP_TIME float64 DEP_DELAY float64 TAXI_OUT float64 TAXI_IN float64 ARR_TIME float64 ARR_DELAY float64 CANCELLED float64 CANCELLATION_CODE object CARRIER_DELAY float64 WEATHER_DELAY float64 NAS_DELAY float64 SECURITY_DELAY float64 LATE_AIRCRAFT_DELAY float64 dtype: object
Data Cleaning¶
# Airports in this dataset classified as "medium" (everything else is "large").
MEDIUM_AIRPORT_CODE = [
    "DAL", "PDX", "STL", "RDU", "HOU",
    "SMF", "MSY", "SJC", "SJU", "SNA",
]
import matplotlib.pyplot as plt
# Drop identifier/timestamp columns not needed for the review-vs-delay analysis;
# keep city/market ids, delay amounts, and cancellation fields.
target_df = df.drop(columns = [
"OP_UNIQUE_CARRIER",
"TAIL_NUM",
"ORIGIN_AIRPORT_SEQ_ID",
"DEST_AIRPORT_SEQ_ID",
"DEP_TIME",
"TAXI_OUT",
"TAXI_IN",
"ARR_TIME",
])
# Free the full ~13M-row DataFrame to reduce memory pressure.
del df
"""
dest_df = df.drop(columns = [
"OP_UNIQUE_CARRIER",
"TAIL_NUM",
"ORIGIN_AIRPORT_SEQ_ID",
"ORIGIN_CITY_MARKET_ID",
"ORIGIN",
"ORIGIN_CITY_NAME",
"DEST_AIRPORT_SEQ_ID",
"DEST_CITY_MARKET_ID",
"DEP_TIME",
"TAXI_OUT",
"TAXI_IN",
"ARR_TIME",
])
"""
# Parse flight dates (read as strings) into datetimes so .dt accessors work later.
target_df['FL_DATE'] = pd.to_datetime(target_df['FL_DATE'])
# Fraction of missing values per column; the five delay-cause columns are ~80% NaN
# (they are only populated when a flight was actually delayed).
target_df.isnull().mean()
FL_DATE 0.000000 ORIGIN_CITY_MARKET_ID 0.000000 ORIGIN 0.000000 ORIGIN_CITY_NAME 0.000000 DEST_CITY_MARKET_ID 0.000000 DEST 0.000000 DEST_CITY_NAME 0.000000 DEP_DELAY 0.016377 ARR_DELAY 0.018966 CANCELLED 0.000000 CANCELLATION_CODE 0.983244 CARRIER_DELAY 0.801989 WEATHER_DELAY 0.801989 NAS_DELAY 0.801989 SECURITY_DELAY 0.801989 LATE_AIRCRAFT_DELAY 0.801989 dtype: float64
# Restrict the review table to airports that appear in the flight data, then
# compute per-airport mean scores and review counts.
airport_set_df = target_df.ORIGIN.drop_duplicates()
airport_review = pd.read_csv("../supplementary/Airport-Reviews-Table.csv")
airport_review = airport_review[airport_review.AIRPORT_CODE.isin(airport_set_df)]
airport_default_score = airport_review.groupby("AIRPORT_CODE")[[
    'ratingValue',
    'queueTime',
    'terminalCleanliness',
    'terminalSeating',
    'terminalSign',
    'foodBeverage',
    'airportShopping',
    'wifiConnectivity',
    'airportStaff',
    'sentiment'
]].mean().reset_index()
# size().reset_index(name=...) names the count column directly instead of
# renaming the default `0` column afterwards.
airport_review_count = airport_review.groupby("AIRPORT_CODE").size().reset_index(name="reviewCount")
airport_review_merged = pd.merge(airport_review_count, airport_default_score, how="inner", on="AIRPORT_CODE")
# Large = not in the medium-airport list.
airport_review_merged['Large'] = ~airport_review_merged.AIRPORT_CODE.isin(MEDIUM_AIRPORT_CODE)
airport_review_merged.head()
| AIRPORT_CODE | reviewCount | ratingValue | queueTime | terminalCleanliness | terminalSeating | terminalSign | foodBeverage | airportShopping | wifiConnectivity | airportStaff | sentiment | Large | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | ATL | 333 | 0.256456 | 1.939940 | 2.750751 | 2.390511 | 2.623853 | 1.784195 | 2.450777 | 2.613333 | 1.754902 | -0.741742 | True |
| 1 | CLT | 169 | 0.242012 | 1.958580 | 2.349112 | 2.019481 | 2.625767 | 1.951220 | 2.468468 | 2.100917 | 1.947020 | -0.715976 | True |
| 2 | DAL | 26 | 0.450000 | 3.173913 | 3.695652 | 3.150000 | 3.478261 | 3.304348 | 3.090909 | 3.714286 | 2.875000 | -0.153846 | False |
| 3 | DEN | 151 | 0.260265 | 1.847682 | 2.536424 | 2.253731 | 2.213333 | 2.006757 | 2.308511 | 2.557895 | 1.934307 | -0.761589 | True |
| 4 | DFW | 175 | 0.321143 | 2.298851 | 2.908046 | 2.771429 | 2.812865 | 2.179191 | 2.793478 | 2.647619 | 2.102564 | -0.531429 | True |
# Attach origin-airport review scores, then destination-airport scores.
# The first merge leaves the review columns unsuffixed (no name collision yet),
# so they are renamed to *_origin afterwards; the second merge suffixes the
# colliding destination copies with "_dest".
target_airport_df = pd.merge(
    target_df,
    airport_review_merged,
    how="inner",
    left_on="ORIGIN",
    right_on="AIRPORT_CODE",
    suffixes=["", "_origin"],
)
target_airport_df = pd.merge(
    target_airport_df,
    airport_review_merged,
    how="inner",
    left_on="DEST",
    right_on="AIRPORT_CODE",
    suffixes=["", "_dest"],
)
_review_cols = [
    'reviewCount', 'ratingValue', 'queueTime', 'terminalCleanliness',
    'terminalSeating', 'terminalSign', 'foodBeverage', 'airportShopping',
    'wifiConnectivity', 'airportStaff', 'sentiment', 'Large',
]
target_airport_df.rename(columns={c: f"{c}_origin" for c in _review_cols}, inplace=True)
# The join keys duplicate ORIGIN/DEST; drop them.
target_airport_df.drop(columns=['AIRPORT_CODE', 'AIRPORT_CODE_dest'], inplace=True)
# Sentinel code "Not" marks flights that were not cancelled.
target_airport_df['CANCELLATION_CODE'] = target_airport_df['CANCELLATION_CODE'].fillna("Not")
target_airport_df.head()
| FL_DATE | ORIGIN_CITY_MARKET_ID | ORIGIN | ORIGIN_CITY_NAME | DEST_CITY_MARKET_ID | DEST | DEST_CITY_NAME | DEP_DELAY | ARR_DELAY | CANCELLED | ... | queueTime_dest | terminalCleanliness_dest | terminalSeating_dest | terminalSign_dest | foodBeverage_dest | airportShopping_dest | wifiConnectivity_dest | airportStaff_dest | sentiment_dest | Large_dest | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2014-07-01 | 31057 | CLT | Charlotte, NC | 30194 | DFW | Dallas/Fort Worth, TX | -5.0 | 9.0 | 0.0 | ... | 2.298851 | 2.908046 | 2.771429 | 2.812865 | 2.179191 | 2.793478 | 2.647619 | 2.102564 | -0.531429 | True |
| 1 | 2014-07-01 | 30194 | DFW | Dallas/Fort Worth, TX | 31057 | CLT | Charlotte, NC | -4.0 | 0.0 | 0.0 | ... | 1.958580 | 2.349112 | 2.019481 | 2.625767 | 1.951220 | 2.468468 | 2.100917 | 1.947020 | -0.715976 | True |
| 2 | 2014-07-01 | 30397 | ATL | Atlanta, GA | 30194 | DFW | Dallas/Fort Worth, TX | -2.0 | -9.0 | 0.0 | ... | 2.298851 | 2.908046 | 2.771429 | 2.812865 | 2.179191 | 2.793478 | 2.647619 | 2.102564 | -0.531429 | True |
| 3 | 2014-07-01 | 30194 | DFW | Dallas/Fort Worth, TX | 30397 | ATL | Atlanta, GA | 14.0 | 4.0 | 0.0 | ... | 1.939940 | 2.750751 | 2.390511 | 2.623853 | 1.784195 | 2.450777 | 2.613333 | 1.754902 | -0.741742 | True |
| 4 | 2014-07-01 | 30194 | DFW | Dallas/Fort Worth, TX | 30397 | ATL | Atlanta, GA | 1.0 | 2.0 | 0.0 | ... | 1.939940 | 2.750751 | 2.390511 | 2.623853 | 1.784195 | 2.450777 | 2.613333 | 1.754902 | -0.741742 | True |
5 rows × 40 columns
Visualization¶
Delay Trend¶
import plotly.express as px

# Mean delay minutes per year, broken down by delay cause.
# .copy() fixes the SettingWithCopyWarning raised by the original code:
# the column selection returns a (possible) view of target_airport_df, and
# assigning the Year column onto a view triggers the warning.
delay_by_year = target_airport_df[[
    'FL_DATE',
    'CARRIER_DELAY',
    'WEATHER_DELAY',
    'NAS_DELAY',
    'SECURITY_DELAY',
    'LATE_AIRCRAFT_DELAY',
]].copy()
delay_by_year['Year'] = delay_by_year['FL_DATE'].dt.year
delay_by_year = delay_by_year.groupby("Year")[[
    'CARRIER_DELAY',
    'WEATHER_DELAY',
    'NAS_DELAY',
    'SECURITY_DELAY',
    'LATE_AIRCRAFT_DELAY',
]].mean().reset_index()
# Long format: one row per (Year, delay cause) for the stacked area chart.
delay_by_year = delay_by_year.melt(id_vars="Year", value_name="Delay (m)")
fig = px.area(
    delay_by_year,
    x="Year",
    y="Delay (m)",
    color='variable',
    title='Airport Delay Trend in the Past 10Y',
)
fig.update_layout(
    height=600,
)
fig.show()
C:\Users\wongh\AppData\Local\Temp\ipykernel_7724\370473652.py:11: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
Airport Review¶
First of all, let's have a look at the Review Data we collected
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Correlation between all review attributes, drawn as a lower-triangle heatmap
# (the upper triangle is masked out to avoid duplicating each pair).
corr_matrix = airport_review_merged.loc[:, 'reviewCount':].corr()
upper_triangle = np.triu(np.ones_like(corr_matrix, dtype=bool))
plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', mask=upper_triangle)
plt.title('Airport Review Score Correlation')
plt.show()
Summary
Most of the scores given by users are highly correlated with one another,
with the exception of an airport's reviewCount, which correlates inversely with all other attributes.
Airport Ranking¶
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots

# Sentiment vs. overall rating per airport; bubble size = number of reviews,
# faceted into medium vs. large airports.
fig = go.Figure()
fig = px.scatter(
    data_frame=airport_review_merged,
    x='sentiment',
    y='ratingValue',
    color='ratingValue',
    size='reviewCount',
    color_continuous_scale='viridis',
    facet_col="Large",
    text='AIRPORT_CODE',
    log_y=True,
)
fig.update_traces(textposition='top center')

def _facet_title(annotation):
    # Replace plotly's default "Large=True"/"Large=False" facet labels.
    label = "Medium Airport" if annotation.text == "Large=False" else "Large Airport"
    annotation.update(text=label)

fig.for_each_annotation(_facet_title)
fig.update_layout(
    height=600,
    width=1000,
    title_text="Airport Sentiment Visualization",
    showlegend=False,
)
fig.show()
Review Performance per Airport¶
# One bar-chart subplot per review attribute, each sorted from best- to
# worst-scoring airport.
df_columns = [
    'queueTime',
    'terminalCleanliness',
    'terminalSeating',
    'terminalSign',
    'foodBeverage',
    'airportShopping',
    'wifiConnectivity',
    'airportStaff',
]
fig = make_subplots(rows=4, cols=2, subplot_titles=df_columns)
for i, column in enumerate(df_columns):
    # Bug fix: the original called sort_values(..., inplace=True), mutating
    # the shared airport_review_merged DataFrame as a side effect of a
    # plotting cell; sort a copy instead.
    ranked = airport_review_merged.sort_values(column, ascending=False)
    fig.add_trace(
        go.Bar(
            x=ranked['AIRPORT_CODE'],
            y=ranked[column],
            marker=dict(
                color=ranked[column],
                colorscale='viridis',
            ),
        ),
        # Fill subplots left-to-right, top-to-bottom (1-based indices).
        row=i // 2 + 1,
        col=i % 2 + 1,
    )
fig.update_layout(
    height=1000,
    title_text="Individual Scoring Attributes of Airports",
    showlegend=False,
)
fig.show()
Delay vs Ranking¶
# Review-score features, with an origin-airport and a destination-airport
# copy of each (matching the *_origin/*_dest merge suffixes).
_REVIEW_FEATURES = [
    'queueTime',
    'terminalCleanliness',
    'terminalSeating',
    'terminalSign',
    'foodBeverage',
    'airportShopping',
    'wifiConnectivity',
    'airportStaff',
    'Large',
]
train_columns = [f"{feature}_{side}" for side in ("origin", "dest") for feature in _REVIEW_FEATURES]
# Delay measurements (minutes) used as analysis targets.
label_columns = [
    "DEP_DELAY",
    "ARR_DELAY",
    "CARRIER_DELAY",
    "WEATHER_DELAY",
    "NAS_DELAY",
    "SECURITY_DELAY",
    "LATE_AIRCRAFT_DELAY",
]
Proportion of flights with each delay type > 0, per origin airport
# Share of flights with a strictly positive value for each delay type,
# grouped by origin airport.
cal_df = target_airport_df[['ORIGIN'] + label_columns]
cal_df.groupby('ORIGIN')[label_columns].apply(lambda frame: (frame > 0).mean()).reset_index()
| ORIGIN | DEP_DELAY | ARR_DELAY | CARRIER_DELAY | WEATHER_DELAY | NAS_DELAY | SECURITY_DELAY | LATE_AIRCRAFT_DELAY | |
|---|---|---|---|---|---|---|---|---|
| 0 | ATL | 0.391977 | 0.346680 | 0.102059 | 0.012486 | 0.098133 | 0.000559 | 0.070856 |
| 1 | CLT | 0.386671 | 0.377364 | 0.117649 | 0.012658 | 0.119450 | 0.002985 | 0.094161 |
| 2 | DAL | 0.511887 | 0.405451 | 0.138866 | 0.010460 | 0.092957 | 0.001003 | 0.135679 |
| 3 | DEN | 0.442719 | 0.386610 | 0.127630 | 0.010701 | 0.112139 | 0.000455 | 0.110875 |
| 4 | DFW | 0.417090 | 0.400279 | 0.137167 | 0.018804 | 0.118154 | 0.000864 | 0.105878 |
| 5 | EWR | 0.401861 | 0.362531 | 0.133538 | 0.011004 | 0.122859 | 0.000979 | 0.107049 |
| 6 | HOU | 0.461493 | 0.377129 | 0.120927 | 0.009179 | 0.084441 | 0.000970 | 0.115730 |
| 7 | JFK | 0.357083 | 0.337509 | 0.119247 | 0.010789 | 0.129858 | 0.001389 | 0.068300 |
| 8 | LAS | 0.400518 | 0.375535 | 0.111114 | 0.005143 | 0.108687 | 0.000699 | 0.113227 |
| 9 | LAX | 0.384964 | 0.363739 | 0.104485 | 0.004377 | 0.102161 | 0.000770 | 0.096527 |
| 10 | MCO | 0.390880 | 0.372180 | 0.110292 | 0.020802 | 0.120200 | 0.001602 | 0.106974 |
| 11 | MIA | 0.405724 | 0.393572 | 0.136413 | 0.019572 | 0.126643 | 0.001830 | 0.099147 |
| 12 | MSY | 0.360981 | 0.338699 | 0.087303 | 0.010217 | 0.093822 | 0.000673 | 0.106858 |
| 13 | ORD | 0.405576 | 0.382400 | 0.121694 | 0.021180 | 0.129221 | 0.000579 | 0.098953 |
| 14 | PDX | 0.298812 | 0.321973 | 0.066115 | 0.004676 | 0.084625 | 0.000397 | 0.068284 |
| 15 | PHX | 0.392451 | 0.362283 | 0.101458 | 0.003507 | 0.090475 | 0.001300 | 0.098925 |
| 16 | RDU | 0.344065 | 0.332790 | 0.086812 | 0.008671 | 0.104936 | 0.000614 | 0.091356 |
| 17 | SEA | 0.351422 | 0.362239 | 0.082688 | 0.006774 | 0.105163 | 0.000757 | 0.068566 |
| 18 | SFO | 0.369786 | 0.367693 | 0.098870 | 0.004128 | 0.108372 | 0.000497 | 0.107824 |
| 19 | SJC | 0.340300 | 0.339721 | 0.079905 | 0.003123 | 0.076541 | 0.000883 | 0.098579 |
| 20 | SJU | 0.347668 | 0.355347 | 0.117447 | 0.002182 | 0.102229 | 0.001834 | 0.121220 |
| 21 | SMF | 0.358445 | 0.343033 | 0.077687 | 0.003122 | 0.080243 | 0.000597 | 0.100190 |
| 22 | SNA | 0.352813 | 0.332351 | 0.068681 | 0.004160 | 0.083964 | 0.000328 | 0.098166 |
| 23 | STL | 0.390224 | 0.347997 | 0.098000 | 0.008861 | 0.093257 | 0.000460 | 0.101216 |
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Correlation between review features and delay targets.
corr_matrix = target_airport_df.loc[:, train_columns + label_columns].corr()
plt.figure(figsize=(10, 5))
sns.heatmap(corr_matrix.loc[train_columns, label_columns], annot=True, cmap='coolwarm')
plt.title('Airport Review Score Correlation with Delays')
plt.show()

# Redraw without NAS_DELAY. Bug fix: the original called
# label_columns.remove('NAS_DELAY'), permanently mutating the module-level
# list for every later cell; use a filtered copy instead.
plt.figure(figsize=(10, 5))
label_columns_no_nas = [c for c in label_columns if c != 'NAS_DELAY']
sns.heatmap(corr_matrix.loc[train_columns, label_columns_no_nas], annot=True, cmap='coolwarm')
plt.title('Airport Review Score Correlation with Delays (Without NAS)')
plt.show()
Departure Delay Per Airport¶
for airport_direction in ["ORIGIN", "DEST"]:
airport_temp = target_airport_df.sample(1000000, replace = False).groupby(airport_direction)['DEP_DELAY'].quantile(0.75).reset_index()
airport_temp_merged = pd.merge(
target_airport_df.sample(100000, replace = False),
airport_temp, how = "inner",
on = airport_direction,
suffixes = ["", "_mean"]
)
airport_temp_merged['DEP_DELAY'] = airport_temp_merged['DEP_DELAY'].apply(lambda x: 0 if x < 0 else x)
fig = px.box(
airport_temp_merged.sort_values([
f"Large_{airport_direction.lower()}",
"DEP_DELAY_mean"
], ascending = False),
x=airport_direction,
y="DEP_DELAY",
log_y = True,
color = f"Large_{airport_direction.lower()}",
color_discrete_map = {True: 'red', False: 'blue'}
)
fig.update_layout(
height=400,
title_text=f"{airport_direction} Aiport Departure Delay",
showlegend=True
)
fig.show()
Cancelled¶
# Same review-score features as before (origin and destination copies).
train_columns = [
    f"{feature}_{side}"
    for side in ("origin", "dest")
    for feature in (
        'queueTime', 'terminalCleanliness', 'terminalSeating', 'terminalSign',
        'foodBeverage', 'airportShopping', 'wifiConnectivity', 'airportStaff',
        'Large',
    )
]
# One-hot cancellation-code targets; "Not" is the fillna sentinel for
# flights that were not cancelled.
label_columns = ["Cancellation_" + code for code in ("A", "B", "C", "D", "Not")]
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# One-hot encode the cancellation code and correlate the dummies with the
# review features.
temp = pd.concat([
    target_airport_df,
    pd.get_dummies(target_airport_df['CANCELLATION_CODE'], prefix="Cancellation")
], axis=1)
corr_matrix = temp.loc[:, train_columns + label_columns].corr()
plt.figure(figsize=(10, 5))
sns.heatmap(corr_matrix.loc[train_columns, label_columns], annot=True, cmap='coolwarm')
# Title fixed: this chart shows cancellation codes, not delays
# (the old title was copy-pasted from the delay heatmap).
plt.title('Airport Review Score Correlation with Cancellations')
plt.show()
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from tqdm import tqdm
import numpy as np
import math
from sklearn.metrics import confusion_matrix, roc_curve, roc_auc_score
import matplotlib.pyplot as plt
from sklearn.preprocessing import label_binarize
def preprocess(df, y_columns='ARR_DELAY'):
    """Split *df* into the feature matrix (train_columns) and the target column(s)."""
    features = df.loc[:, train_columns]
    target = df.loc[:, y_columns]
    return features, target
# 80/20 split, min-max scale the review features, and fit a one-vs-rest
# logistic regression predicting the cancellation code.
train, test = train_test_split(target_airport_df, test_size=0.2)
train_X, train_y = preprocess(train, "CANCELLATION_CODE")
test_X, test_y = preprocess(test, "CANCELLATION_CODE")
min_max_scaler = preprocessing.MinMaxScaler()
train_X_scaled = min_max_scaler.fit_transform(train_X)
# transform (not fit_transform) keeps the scaling learned from the training split.
test_X_scaled = min_max_scaler.transform(test_X)
# NOTE(review): multi_class='ovr' is deprecated in scikit-learn 1.5 and removed
# in 1.7 (see the FutureWarning below) — migrate to
# OneVsRestClassifier(LogisticRegression(...)) when convenient.
# class_weight="balanced" compensates for the heavy class imbalance
# (~98.3% of rows are the "Not" cancelled sentinel, per the null-rate check above).
model = LogisticRegression(
multi_class='ovr',
max_iter=1000,
class_weight="balanced"
)
model.fit(train_X_scaled, train_y)
C:\Users\wongh\anaconda3\envs\601\lib\site-packages\sklearn\linear_model\_logistic.py:1256: FutureWarning: 'multi_class' was deprecated in version 1.5 and will be removed in 1.7. Use OneVsRestClassifier(LogisticRegression(..)) instead. Leave it to its default value to avoid this warning.
LogisticRegression(class_weight='balanced', max_iter=1000, multi_class='ovr')In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LogisticRegression(class_weight='balanced', max_iter=1000, multi_class='ovr')
import seaborn as sn
import matplotlib.pyplot as plt

# Predictions and confusion matrices for both splits.
train_predictions = model.predict(train_X_scaled)
test_predictions = model.predict(test_X_scaled)
train_conf_matrix = confusion_matrix(train_y, train_predictions)
test_conf_matrix = confusion_matrix(test_y, test_predictions)
sn.heatmap(train_conf_matrix, annot=True, fmt='g')
plt.show()
sn.heatmap(test_conf_matrix, annot=True, fmt='g')
plt.show()

# Micro-averaged ROC inputs: binarize the labels against the model's class
# order, then flatten all classes into one curve.
train_y_encoded = label_binarize(train_y, classes=model.classes_)
test_y_encoded = label_binarize(test_y, classes=model.classes_)
train_probs = model.predict_proba(train_X_scaled)
test_probs = model.predict_proba(test_X_scaled)
fpr_train, tpr_train, _ = roc_curve(train_y_encoded.ravel(), train_probs.ravel())
fpr_test, tpr_test, _ = roc_curve(test_y_encoded.ravel(), test_probs.ravel())
train_auc = roc_auc_score(train_y_encoded, train_probs, average='micro')
test_auc = roc_auc_score(test_y_encoded, test_probs, average='micro')
def roc(fpr, tpr, auc, name="Train"):
    """Plot one micro-averaged ROC curve with its AUC in the legend.

    Backward-compatible fix: *name* defaults to "Train", matching the old
    hard-coded labels, so existing callers behave the same; the test curve
    was previously mislabelled "Train" as well.
    """
    plt.figure(figsize=(12, 6))
    plt.subplot(1, 2, 1)
    plt.plot(
        fpr,
        tpr,
        label=f'{name} ROC curve (area = {auc:.2})'
    )
    plt.plot([0, 1], [0, 1], 'k--')  # chance diagonal
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'{name} ROC Curve')
    plt.legend(loc="lower right")

roc(fpr_train, tpr_train, train_auc)
roc(fpr_test, tpr_test, test_auc, name="Test")
# One horizontal bar chart of model coefficients per cancellation class.
weight = model.coef_
fig, axs = plt.subplots(5, 1, figsize=(10, 20))
for class_index, ax in enumerate(axs):
    ax.barh(train_columns, weight[class_index])
    ax.set_title(f'Bar Chart {label_columns[class_index]}')
plt.tight_layout()
plt.show()
# Linear-regression experiment helpers (reformatted from a collapsed
# notebook cell: the imports and function headers had been flattened onto
# single lines).
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from tqdm import tqdm
import numpy as np
import math

def preprocess(df, y_columns='ARR_DELAY'):
    """Drop rows missing the target, then split into features and target.

    Bug fix: the original used dropna(..., inplace=True), mutating the
    caller's DataFrame (a train_test_split slice) as a side effect; the
    non-mutating form keeps the same returned rows without touching the input.
    """
    df = df.dropna(subset=y_columns)
    X = df.loc[:, train_columns]
    y = df.loc[:, y_columns]
    return X, y

def experiment(df, y_columns='ARR_DELAY'):
    """Fit LinearRegression on an 80/20 split; return train R², test R², and coefficients."""
    train, test = train_test_split(df, test_size=0.2)
    train_X, train_y = preprocess(train, y_columns)
    test_X, test_y = preprocess(test, y_columns)
    min_max_scaler = preprocessing.MinMaxScaler()
    train_X_scaled = min_max_scaler.fit_transform(train_X)
    test_X_scaled = min_max_scaler.transform(test_X)
    model = LinearRegression()
    model.fit(train_X_scaled, train_y)
    train_r_score = model.score(train_X_scaled, train_y)
    test_r_score = model.score(test_X_scaled, test_y)
    return train_r_score, test_r_score, model.coef_
Redacted¶
# Duplicate of the experiment helpers above (this "Redacted" cell repeats the
# previous one verbatim); reformatted from the collapsed notebook cell and
# given the same non-mutating dropna fix.
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from tqdm import tqdm
import numpy as np
import math

def preprocess(df, y_columns='ARR_DELAY'):
    """Drop rows missing the target, then split into features and target.

    Bug fix: dropna(inplace=True) mutated the caller's DataFrame; the
    non-mutating form returns the same rows without side effects.
    """
    df = df.dropna(subset=y_columns)
    X = df.loc[:, train_columns]
    y = df.loc[:, y_columns]
    return X, y

def experiment(df, y_columns='ARR_DELAY'):
    """Fit LinearRegression on an 80/20 split; return train R², test R², and coefficients."""
    train, test = train_test_split(df, test_size=0.2)
    train_X, train_y = preprocess(train, y_columns)
    test_X, test_y = preprocess(test, y_columns)
    min_max_scaler = preprocessing.MinMaxScaler()
    train_X_scaled = min_max_scaler.fit_transform(train_X)
    test_X_scaled = min_max_scaler.transform(test_X)
    model = LinearRegression()
    model.fit(train_X_scaled, train_y)
    train_r_score = model.score(train_X_scaled, train_y)
    test_r_score = model.score(test_X_scaled, test_y)
    return train_r_score, test_r_score, model.coef_
Arrival Delay Reasoning¶
# Repeat the arrival-delay regression EXPERIMENT_COUNT times with different
# seeds; collect R² scores and coefficients. (Reformatted from a collapsed
# cell.)
# NOTE(review): EXPERIMENT_COUNT and the LOG_ARR_DELAY column are defined in
# other cells — this notebook appears to have been executed out of
# top-to-bottom order; confirm before re-running.
train_r_list, test_r_list, coef_list = list(), list(), list()
for i in tqdm(range(EXPERIMENT_COUNT)):
    np.random.seed(i)
    train_r_score, test_r_score, coef = experiment(target_airport_df, 'LOG_ARR_DELAY')
    train_r_list.append(train_r_score)
    test_r_list.append(test_r_score)
    coef_list.append(coef)
import plotly.express as px

# One tidy (Feature, Weight) frame per experiment run, concatenated for a
# box plot of coefficient stability across runs. (Reformatted from a
# collapsed cell.)
coef_df_list = list()
for i, array in enumerate(coef_list):
    coef_df = pd.DataFrame({
        "Feature": train_columns,
        "Weight": array,
    })
    coef_df_list.append(coef_df)
result_df = pd.concat(coef_df_list)
fig = px.box(
    result_df,
    x="Feature",
    y="Weight",
)
fig.update_layout(
    height=600,
    # Typo fixed: "Aiport" -> "Airport".
    title_text="Airport Rating vs Arrival Delay",
    showlegend=False,
)
fig.show()
train_r_list, test_r_list
([0.0027769056925236457, 0.002726794565260926, 0.002739984286952546, 0.0027382477593851373, 0.002750673837592532, 0.0027616015050112575, 0.002722193250161431, 0.002763664872232119, 0.0027681168845542503, 0.002773670861719535], [0.0026226756634710746, 0.0028214286965265734, 0.0027696415080686876, 0.002776677393967586, 0.0027283399081202697, 0.0026838710483376715, 0.0028416825308008598, 0.0026738151940109978, 0.00265653353537032, 0.002636301651983919])
Departure Delay Reasoning¶
# Same repeated-experiment loop as the arrival-delay version, but with
# raw DEP_DELAY as the target. (Reformatted from a collapsed cell;
# EXPERIMENT_COUNT is defined in another cell.)
train_r_list, test_r_list, coef_list = list(), list(), list()
for i in tqdm(range(EXPERIMENT_COUNT)):
    np.random.seed(i)
    train_r_score, test_r_score, coef = experiment(target_airport_df, 'DEP_DELAY')
    train_r_list.append(train_r_score)
    test_r_list.append(test_r_score)
    coef_list.append(coef)
import plotly.express as px

# Box plot of coefficient stability for the departure-delay runs.
# (Reformatted from a collapsed cell.)
coef_df_list = list()
for i, array in enumerate(coef_list):
    coef_df = pd.DataFrame({
        "Feature": train_columns,
        "Weight": array,
    })
    coef_df_list.append(coef_df)
result_df = pd.concat(coef_df_list)
fig = px.box(
    result_df,
    x="Feature",
    y="Weight",
)
fig.update_layout(
    height=600,
    # Typo fixed: "Aiport" -> "Airport".
    title_text="Airport Rating vs Departure Delay",
    showlegend=False,
)
fig.show()
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px

# Bar chart of the last experiment run's coefficients, sorted by weight.
# (Reformatted from a collapsed cell.)
fig = go.Figure()
coef_df.sort_values("Weight", ascending=False, inplace=True)
fig = px.bar(
    data_frame=coef_df,
    x='Feature',
    y='Weight',
    color='Weight',
    color_continuous_scale='viridis',
)
fig.update_layout(
    height=600,
    width=1000,
    # NOTE(review): title appears copy-pasted from the sentiment scatter —
    # it likely should describe the coefficient weights; confirm intent.
    title_text="Airport Sentiment Visualization",
    showlegend=False,
)
fig.show()
import math

# Log-transform the delay columns (0 for on-time/early flights, since
# log is undefined at <= 0) to tame the heavy right tail before regression.
# (Reformatted from a collapsed cell; NaN delays stay NaN because
# NaN <= 0 is False and math.log(nan) is nan.)
target_airport_df["LOG_ARR_DELAY"] = target_airport_df["ARR_DELAY"].apply(
    lambda x: 0 if x <= 0 else math.log(x)
)
target_airport_df["LOG_DEP_DELAY"] = target_airport_df["DEP_DELAY"].apply(
    lambda x: 0 if x <= 0 else math.log(x)
)
target_airport_df["LOG_ARR_DELAY"]